# install.packages(c("dplyr", "ggplot2", "plotly", "htmlwidgets", "here", "htmltools"))
library(dplyr)
library(ggplot2)
library(plotly)
library(htmlwidgets)
# Read the semicolon-delimited film table from Raw_Data/film.csv
data <- here::here("Raw_Data", "film.csv") %>%
  read.csv(sep = ";")
# Preview the first few rows
data %>% head()
## Year Length Title Subject Actor
## 1 INT INT STRING CAT CAT
## 2 1990 111 Tie Me Up! Tie Me Down! Comedy Banderas, Antonio
## 3 1991 113 High Heels Comedy Bosé, Miguel
## 4 1983 104 Dead Zone, The Horror Walken, Christopher
## 5 1979 122 Cuba Action Connery, Sean
## 6 1978 94 Days of Heaven Drama Gere, Richard
## Actress Director Popularity Awards X.Image
## 1 CAT CAT INT BOOL STRING
## 2 Abril, Victoria Almodóvar, Pedro 68 No NicholasCage.png
## 3 Abril, Victoria Almodóvar, Pedro 68 No NicholasCage.png
## 4 Adams, Brooke Cronenberg, David 79 No NicholasCage.png
## 5 Adams, Brooke Lester, Richard 6 No seanConnery.png
## 6 Adams, Brooke Malick, Terrence 14 No NicholasCage.png
# Drop the type-descriptor row (the one whose Year reads "INT") and the
# X.Image column
clean_data <- data %>%
  filter(Year != "INT") %>%
  select(-X.Image)
# Confirm that exactly one row and one column were removed
dim(data)
## [1] 1660 10
dim(clean_data)
## [1] 1659 9
# Inspect the class of every remaining column
lapply(clean_data, class)
## $Year
## [1] "character"
##
## $Length
## [1] "character"
##
## $Title
## [1] "character"
##
## $Subject
## [1] "character"
##
## $Actor
## [1] "character"
##
## $Actress
## [1] "character"
##
## $Director
## [1] "character"
##
## $Popularity
## [1] "character"
##
## $Awards
## [1] "character"
# Convert the numeric fields from character to integer.
# The columns are addressed by name (Year, Length, Popularity — previously
# positions 1, 2 and 8) so the conversion does not silently hit the wrong
# fields if the column order ever changes.
int_cols <- c("Year", "Length", "Popularity")
clean_data[int_cols] <- lapply(clean_data[int_cols], as.integer)
# check class types to see if it worked
lapply(clean_data, class)
## $Year
## [1] "integer"
##
## $Length
## [1] "integer"
##
## $Title
## [1] "character"
##
## $Subject
## [1] "character"
##
## $Actor
## [1] "character"
##
## $Actress
## [1] "character"
##
## $Director
## [1] "character"
##
## $Popularity
## [1] "integer"
##
## $Awards
## [1] "character"
# Mark every empty-string cell as missing, one column at a time
clean_data[] <- lapply(clean_data, function(column) {
  replace(column, which(column == ""), NA)
})
# Tally missing versus non-missing cells to confirm the replacement
table(is.na(clean_data))
##
## FALSE TRUE
## 14217 714
# F1. Does the proportion of movies made in each subject change over time?
# F1 data: for each Year, the number of films per Subject (cat_n) and that
# count as a percentage of the year's films (prop_cat).
# Rows missing either Year or Subject are excluded.
F1 <- clean_data %>%
  filter(!is.na(Year), !is.na(Subject)) %>%
  group_by(Year, Subject) %>%
  # Keep the Year grouping explicitly: the percentage below must be computed
  # within each year, and stating .groups avoids summarise()'s regrouping
  # message.
  summarise(cat_n = n(), .groups = "drop_last") %>%
  mutate(prop_cat = (cat_n / sum(cat_n)) * 100) %>%
  # Hand back an ungrouped table so later steps are not silently grouped.
  ungroup()
# F2. Compare the popularity score for movies that did and did not receive awards.
# F2 data: keep only films with both an Awards value and a Popularity score
F2 <- clean_data %>%
  filter(!is.na(Awards), !is.na(Popularity))
# Figure 1: stacked bars showing each genre's share of films per year.
# prop_cat is already a within-year percentage, so the bars are stacked as-is:
# every year totals 100 and the axis reads in percent, matching its
# "% of Movies" label (position = "fill" rescaled the axis to 0-1 instead).
Fig1 <- ggplot(F1, aes(x = Year, y = prop_cat, fill = Subject)) +
  geom_col(position = "stack") +
  ylab("% of Movies") +
  labs(fill = "Genre") +
  ggtitle("Proportion of Movie Genres Over Time")
Fig1
# Figure 2: box plots of Popularity score, split by whether the film
# received awards (one box per Awards value, filled by the same variable).
Fig2 <- ggplot(F2, aes(x = Awards, y = Popularity, fill = Awards)) +
  geom_boxplot() +
  # Axis label spelling corrected ("Recieved" -> "Received")
  xlab("Whether or Not Movie Received Awards") +
  ylab("Popularity Score") +
  ggtitle("Critical Praise vs General Popularity for Movies")
Fig2
# Interactive Figure 1: the filled bar chart rebuilt with a custom hover
# label (passed through the `text` aesthetic) and converted with ggplotly()
Fig1i <- ggplotly(
  ggplot(
    F1,
    aes(
      fill = Subject, y = prop_cat, x = Year,
      text = paste(
        "Year: ", Year, "\n",
        "% of Movies: ", round(prop_cat, digits = 0), "\n",
        "Genre: ", Subject, "\n"
      )
    )
  ) +
    geom_bar(stat = "identity", position = "fill") +
    labs(
      y = "Proportion of Movies",
      fill = "Genre",
      title = "Proportion of Movie Genres Over Time"
    ),
  tooltip = "text"
)
Fig1i
# Interactive Figure 2: converted directly from the static box plot
Fig2i <- Fig2 %>% ggplotly()
Fig2i
# Make sure the output folder exists before anything is written into it
dir.create(here::here("Figures"), showWarnings = FALSE, recursive = TRUE)
# save as static image files
# `plot` is given explicitly: without it ggsave() writes last_plot(), so both
# files would receive the same most recently built figure instead of Fig1
# and Fig2 respectively.
ggsave(here::here("Figures", "F1.png"), plot = Fig1)
ggsave(here::here("Figures", "F2.png"), plot = Fig2)
# save as interactive html files
htmltools::save_html(Fig1i, file = here::here("Figures", "Fig1i.html"))
htmltools::save_html(Fig2i, file = here::here("Figures", "Fig2i.html"))
# save as r data objects to embed in other R products (xaringan slides)
save(Fig1i, file = here::here("Figures", "Fig1i.rda"))
save(Fig2i, file = here::here("Figures", "Fig2i.rda"))